/****************************************************************************/
/***                                                                      ***/
/***   (C) 2012-2014 Dierk Ohlerich et al., all rights reserved.          ***/
/***                                                                      ***/
/***   Released under BSD 2 clause license, see LICENSE.TXT               ***/
/***                                                                      ***/
/****************************************************************************/

mtrl ComputeShader 
{
    // Constants consumed by the compute shader below.
    cbuffer csb0 : cs(0) : register(0)
    {
        row_major float3x4 Matrix;      // applied to every sample position in map() before the field is evaluated
        float PosMul;                   // scale from integer grid coordinates to positions, see trans()
        float PosAdd;                   // offset added to all three axes after scaling, see trans()
        float Size;                     // not read by the shader code visible in this file
        float Delta;                    // offset of the central-difference samples in pnormal()
    };

    cs hlsl5
    {
        // Dimensions

        static const uint GridSize = 16;                // group size: threads along x and y, and z slices sampled per node
        static const uint GroupSize = GridSize*GridSize; // threads per group; stride of the copy-out loops in main()
        static const uint VertSize = GridSize;        // vertices in group
        static const uint CubeSize = VertSize-1;        // cubes in group (one less than verts)
        static const uint ZMask = 3;                    // pot[] keeps ZMask+1 z slices, addressed with (z & ZMask)
        static const uint BufferPerGroup = 256*1024;    // byte size of each group's region inside TempBuffer

        // resources

        // vb: final vertices, 24 bytes each (position at +0, normal at +12).
        RWByteAddressBuffer vb : register(u0);
        // ib: final triangles, 12 bytes each (three vertex indices).
        RWByteAddressBuffer ib : register(u1);
        // NodeCounter: its hidden counter is decremented to hand out entries of 'nodes'.
        RWStructuredBuffer<uint> NodeCounter : register(u2);
        // IndirectBuffer: element 8 accumulates the vertex count, element 9 the triangle count.
        RWBuffer<uint> IndirectBuffer : register(u3);
        // TempBuffer: scratch space, BufferPerGroup bytes per thread group;
        // vertices grow up from the start of the region, triangles grow down from its end.
        RWByteAddressBuffer TempBuffer : register(u4);

        // mcmap: triangle table indexed by (cube code * 5); each entry packs three 10 bit
        // offsets into index[], and 0xffffffff ends a cube's list early.
        Buffer<uint> mcmap : register(t0);
        // nodes: work list; .xyz of an entry is the node coordinate, .w is not read here.
        Buffer<uint4> nodes : register(t1);

        // group local

        groupshared float pot[ZMask+1][GridSize][GridSize];     // ring of the most recent z slices of field samples
        groupshared int index[2*VertSize*VertSize*3];           // vertex number per edge (3 per cell), two z layers selected by (z & 1)
        groupshared uint NodeCounterShared;                     // node fetched by thread 0 for the current pass
        groupshared uint SharedVertex;                          // vertices emitted by this group for the current node
        groupshared uint SharedIndex;                           // triangles emitted by this group for the current node
        groupshared uint GlobalVertex;                          // first slot in vb reserved for the current node
        groupshared uint GlobalIndex;                           // first slot in ib reserved for the current node

        // iso helpers

        // Signed distance to a torus around the y axis: ring radius 0.75, tube radius 0.25.
        float maptorus(float3 p)
        {
            float2 ring = float2(length(p.xz)-0.75,p.y);
            return length(ring)-0.25;
        }

        // Largest per-axis overshoot beyond a box with half extents (0.9, 0.2, 0.1);
        // negative inside the box.
        float mapbox(float3 p)
        {
            float3 over = abs(p)-float3(0.9,0.2,0.1);
            return max(max(over.x,over.y),over.z);
        }

        // Power-8 iteration in spherical coordinates; returns 0.5*log(r)*r/dr of the
        // last evaluated radius r and running derivative dr.
        // NOTE(review): the counter advances by two per pass, so the body runs 7 times
        // (i = 0,2,...,12) although the bound reads 13 - confirm this is intended.
        float mapbulb(float3 pos)
        {
            const float Power = 8;

            float3 z = pos;
            float dr = 1.0;
            float r = 0.0;

            for(uint i=0;i<13;i+=2)
            {
                r = length(z);
                if(r>2)
                    break;                              // escaped: stop iterating

                dr = pow(r,Power-1.0)*Power*dr+1.0;

                // raise z to 'Power' in polar form
                float theta = asin(z.z/r)*Power;
                float phi = atan2(z.y,z.x)*Power;
                float zr = pow(r,Power);

                z = zr*float3(cos(theta)*cos(phi),sin(phi)*cos(theta),sin(theta));
                z += pos;
            }

            return 0.5*log(r)*r/dr;
        }

        // Iterated fold / clamped-inversion / shift field. Returns the length of the
        // iterated point divided by the accumulated scale, minus a small radius.
        float maptemple(float3 pos)
        {
            const float Scale = 2.0;
            const float MinRadius = 0.4;
            const float FixedRadius = 1.0;
            const float RenderRadius = 0.0005;
            const float3 FoldOffset = float3(0.0,2.0,0.0);
            const float3 Shift = float3(0.5,1.0,0.5);

            float3 q = pos;
            q.y += 3.4;

            float scaling = Scale;                      // running product of all applied scale factors
            for(uint iter=0;iter<14;iter++)
            {
                q = abs(q)-FoldOffset;

                // scale by Scale/|q|^2 with |q|^2 clamped to [MinRadius,FixedRadius]
                float k = Scale/clamp(dot(q,q),MinRadius,FixedRadius);
                q *= k;
                scaling *= k;

                q = q-Shift;
            }

            return length(q)/scaling-RenderRadius;
        }

        // Scalar field that gets polygonised: the sample position is transformed by
        // Matrix and handed to one of the shape functions above.
        // Swap maptemple for maptorus, mapbox or mapbulb to render another shape.
        float map(float3 p)
        {
            float3 local = mul(Matrix,float4(p,1));
            return maptemple(local);
        }
        // Linear index of a cell: x fastest, then y, then the z slice wrapped by ZMask.
        uint th(uint3 tid)
        {
            uint slice = tid.z & ZMask;
            return (slice*GridSize + tid.y)*GridSize + tid.x;
        }

        // Fetch a stored field sample; z is wrapped into the ring of ZMask+1 slices.
        float getpot(uint3 tid)
        {
            uint slice = tid.z & ZMask;
            return pot[slice][tid.y][tid.x];
        }

        // Integer grid coordinate -> sample position: scale by PosMul, then add PosAdd to every axis.
        float3 trans(uint3 tid)
        {
            float3 cell = (float3)(tid);
            return cell*PosMul+PosAdd;
        }

        // Normalised central-difference gradient of map() at pos, sampled at +-Delta on each axis.
        float3 pnormal(float3 pos)
        {
            float3 d = float3(Delta,-Delta,0);          // .x = +Delta, .y = -Delta, .z = 0

            float3 grad;
            grad.x = map(pos+d.xzz)-map(pos+d.yzz);
            grad.y = map(pos+d.zxz)-map(pos+d.zyz);
            grad.z = map(pos+d.zzx)-map(pos+d.zzy);

            return normalize(grad);
        }

        // big kahuna

        // Marching-cubes style polygonisation of map().
        //
        // Every thread group repeatedly fetches a node from 'nodes' (handed out through
        // the hidden counter of NodeCounter), samples the field slice by slice along z,
        // emits edge vertices and triangles into its private region of TempBuffer, then
        // reserves space in vb/ib through IndirectBuffer[8]/[9] and copies the results out.
        //
        //   gtid_    - thread position inside the group; x,y select the cell column
        //   groupid_ - group position; only .x is used, to select the TempBuffer region
        //
        // NOTE(review): nothing checks that vertices (growing up) and triangles (growing
        // down) stay inside BufferPerGroup bytes - confirm the budget against the host code.
        [numthreads(GridSize,GridSize,1)]
        void main(uint3 gtid_:SV_GroupThreadID,uint3 groupid_:SV_GroupID)
        {
            uint threadid = gtid_.x+gtid_.y*GridSize;
            uint groupid = groupid_.x;

            while(1)
            {
                // thread 0 fetches the next node and resets the per-node counters

                if(threadid==0)
                {
                    NodeCounterShared = NodeCounter.DecrementCounter();
                    SharedVertex = 0;
                    SharedIndex = 0;
                }
                GroupMemoryBarrierWithGroupSync();
                if(NodeCounterShared>0x80000000)            // counter ran past zero and wrapped: no work left
                    break;
                GroupMemoryBarrierWithGroupSync();

                uint4 nodedata = nodes[NodeCounterShared];

                uint3 gid_ = nodedata.xyz;

                uint3 world = gid_*CubeSize;                     // first thread position in world
                [loop]for(uint gidz=0;gidz<GridSize+1;gidz++)
                {
                    uint3 group = uint3(gtid_.xy,gidz);           // thread position in cube

                    // calculate potential
                    // (gidz is the same for all threads, so the barrier is reached by the whole group)

                    if(gidz<GridSize)
                    {
                        float3 pos = trans(world+group);
                        pot[group.z&ZMask][group.y][group.x] = map(pos);
                        GroupMemoryBarrierWithGroupSync();
                    }

                    // generate vertices: slice gidz-1, one possible vertex per +x, +y and +z edge

                    if(group.x<VertSize && group.y<VertSize && gidz>=1)
                    {
                        uint3 thread = group - uint3(0,0,1);
                        uint indexth = (thread.z&1)*VertSize*VertSize*3 + thread.y*VertSize*3 + thread.x*3;

                        // Neighbour coordinates are clamped to the last sampled cell.
                        // Without the clamp, x+1 / y+1 would index past pot[][GridSize][GridSize]
                        // on the last column / row, and z+1 would hit a stale ring slot on the
                        // last slice. With it, the neighbour becomes the cell itself there, so no
                        // sign change is seen and no vertex is emitted on edges that no cube uses.
                        uint3 hi = min(thread+uint3(1,1,1),uint3(GridSize-1,GridSize-1,GridSize-1));

                        float4 corner;
                        corner.w = getpot(thread);
                        corner.x = getpot(uint3(hi.x,thread.y,thread.z));
                        corner.y = getpot(uint3(thread.x,hi.y,thread.z));
                        corner.z = getpot(uint3(thread.x,thread.y,hi.z));
                        bool4 b = corner<0;

                        uint3 n = 0;
                        if(b.w!=b.x || b.w!=b.y || b.w!=b.z)
                        {
                            float3 p0 = trans(world+thread+uint3(0,0,0));

                            // for each edge with a sign change: interpolate the zero crossing,
                            // grab a vertex number and park the position in TempBuffer

                            if(b.w!=b.x)
                            {
                                float3 p1 = trans(world+thread+uint3(1,0,0));
                                float f = corner.w/(corner.w-corner.x);
                                float3 pos = lerp(p0,p1,f);

                                InterlockedAdd(SharedVertex,1,n.x);
                                uint vp = groupid*BufferPerGroup + n.x*12;
                                TempBuffer.Store3(vp+0,asuint(pos));
                            }

                            if(b.w!=b.y)
                            {
                                float3 p1 = trans(world+thread+uint3(0,1,0));
                                float f = corner.w/(corner.w-corner.y);
                                float3 pos = lerp(p0,p1,f);

                                InterlockedAdd(SharedVertex,1,n.y);
                                uint vp = groupid*BufferPerGroup + n.y*12;
                                TempBuffer.Store3(vp+0,asuint(pos));
                            }

                            if(b.w!=b.z)
                            {
                                float3 p1 = trans(world+thread+uint3(0,0,1));
                                float f = corner.w/(corner.w-corner.z);
                                float3 pos = lerp(p0,p1,f);

                                InterlockedAdd(SharedVertex,1,n.z);
                                uint vp = groupid*BufferPerGroup + n.z*12;
                                TempBuffer.Store3(vp+0,asuint(pos));
                            }
                        }
                        index[indexth+0] = n.x;
                        index[indexth+1] = n.y;
                        index[indexth+2] = n.z;
                    }
                    GroupMemoryBarrierWithGroupSync();

                    // generate indices: cubes between slices gidz-2 and gidz-1

                    if(group.x<CubeSize && group.y<CubeSize && gidz>=2)
                    {
                        uint3 thread = group-uint3(0,0,2);
                        uint indexth = thread.y*VertSize*3 + thread.x*3;

                        float4 pot0,pot1;

                        pot0.x = getpot(thread+uint3(0,0,0));
                        pot0.y = getpot(thread+uint3(1,0,0));
                        pot0.z = getpot(thread+uint3(1,1,0));
                        pot0.w = getpot(thread+uint3(0,1,0));
                        pot1.x = getpot(thread+uint3(0,0,1));
                        pot1.y = getpot(thread+uint3(1,0,1));
                        pot1.z = getpot(thread+uint3(1,1,1));
                        pot1.w = getpot(thread+uint3(0,1,1));

                        // one bit per corner, already multiplied by 5 (table entries per cube code)

                        uint4 code4;
                        code4  = (pot0<0) ? uint4(0x01*5,0x02*5,0x04*5,0x08*5) : uint4(0,0,0,0);
                        code4 += (pot1<0) ? uint4(0x10*5,0x20*5,0x40*5,0x80*5) : uint4(0,0,0,0);
                        code4.xy = code4.yx+code4.zw;
                        uint code = code4.x+code4.y;
                        if((thread.z & 1))                  // odd layers use the second half of mcmap
                            code += 0x100*5;

                        for(int i=0;i<5 && mcmap[code]!=0xffffffff;i++)
                        {
                            uint offsets = mcmap[code];
                            code++;
                            uint3 ind;
                            ind.x = index[indexth+((offsets>>0 )&0x3ff)];
                            ind.y = index[indexth+((offsets>>10)&0x3ff)];
                            ind.z = index[indexth+((offsets>>20)&0x3ff)];

                            // triangles are stored downwards from the end of the group's region
                            uint n; InterlockedAdd(SharedIndex,1,n);
                            TempBuffer.Store3((groupid+1)*BufferPerGroup-12-n*12,ind);
                        }
                    }
                    GroupMemoryBarrierWithGroupSync();
                }

                // reserve output space for this node

                uint newvc = SharedVertex;
                uint newic = SharedIndex;

                if(threadid == 0)
                {
                    InterlockedAdd(IndirectBuffer[8],newvc,GlobalVertex);
                    InterlockedAdd(IndirectBuffer[9],newic,GlobalIndex);
                }
                GroupMemoryBarrierWithGroupSync();
                uint dvp = GlobalVertex;
                uint dip = GlobalIndex;
                DeviceMemoryBarrierWithGroupSync();

                // copy out: vertices get their normal here, indices are rebased to the reserved range

                if(newic>0)
                {
                    for(uint i=threadid;i<newvc;i+=GroupSize)
                    {
                        uint3 pos = TempBuffer.Load3(BufferPerGroup*groupid+i*12+0);
                        vb.Store3((dvp+i)*24+ 0,pos);
                        vb.Store3((dvp+i)*24+12,asuint(pnormal(asfloat(pos))));
                    }
                    for(uint j=threadid;j<newic;j+=GroupSize)
                    {
                        uint3 ind = TempBuffer.Load3(BufferPerGroup*(groupid+1)-12-j*12);
                        ib.Store3((dip+j)*12,ind+dvp);
                    }
                }
                GroupMemoryBarrierWithGroupSync();
            }
        }
    };
}

mtrl WriteCount
{
    cs hlsl5
    {
        RWStructuredBuffer<uint> cnt : register(u0);
        RWBuffer<uint> ind : register(u1);

        // Single-thread pass: turns the triangle count collected in ind[9] into an
        // index count in ind[0], fills ind[1..4] with 1,0,0,0 and clears the
        // accumulators ind[8] and ind[9].
        // NOTE(review): ind[0..4] presumably form the argument block of an indexed
        // indirect draw - confirm against the host code.
        [numthreads(1,1,1)]
        void main(uint3 tid:SV_DispatchThreadID)
        {
            uint triangles = ind[9];

            ind[0] = triangles*3;
            ind[1] = 1;
            for(uint slot=2;slot<=4;slot++)
                ind[slot] = 0;

            ind[8] = 0;
            ind[9] = 0;
        }
    };
}

/****************************************************************************/
